# Load all packages quietly so startup messages don't clutter the report.
suppressPackageStartupMessages({
# Single cell libraries
library(Seurat)
library(scran)
library(scater)
# Plotting
library(rafalib)
library(cowplot)
library(rgl)
library(plotly)
options(rgl.printRglwidget = TRUE)
# Sparse Matrix manipulation tools
library(Matrix)
library(sparseMatrixStats)
# Trajectory analysis
library(slingshot)
library(tradeSeq)
# Dimensionality reduction
library(destiny)
library(fastICA)
})
# Define a color palette for plotting: 25 distinct colors, shuffled once
# (reproducibly) and recycled so any cluster index maps to a color.
pal <- c(
    (scales::hue_pal())(8),
    RColorBrewer::brewer.pal(9, "Set1"),
    RColorBrewer::brewer.pal(8, "Set2")
)
set.seed(1)
pal <- rep(sample(pal, length(pal)), 200)
# Nice function to easily draw a graph:
# Add a KNN graph (column-sparse adjacency matrix) on top of a base R plot.
# `layout` is an n-by-2 coordinate matrix; `graph` is a dgCMatrix whose
# non-zero entries are the edges to draw.
draw_graph <- function(layout, graph, lwd = 0.2, col = "grey") {
    # Expand the column-pointer slot (p) into one column index per non-zero
    # entry, i.e. the "target" node of every edge.
    to <- rep(x = seq_len(length(graph@p) - 1), times = diff(graph@p))
    # Row indices (i) are 0-based, hence the +1; draw one segment per edge.
    segments(x0 = layout[graph@i + 1, 1], x1 = layout[to, 1],
             y0 = layout[graph@i + 1, 2], y1 = layout[to, 2],
             lwd = lwd, col = col)
}
# In order to speed up the computations during the exercises, we will be using a subset of a bone marrow dataset (originally containing about 100K cells). The bone marrow is the source of adult immune cells, and contains virtually all differentiation stages of cells from the immune system, which later circulate in the blood to all other organs.
You can download the files we prepared with these commands:
# webpath <-
# 'https://raw.githubusercontent.com/NBISweden/workshop-scRNAseq/new_dataset/labs/data/bone_marrow/'
# file_list <- c('trajectory_seurat.rds', 'trajectory_scran.rds')
# dir.create('./',recursive = T) for(i in file_list){ download.file( url =
# paste0(webpath,i) , destfile = paste0('./',i))}
If you have been using the scran/scater, Seurat or Scanpy pipelines with your own data, you need to reach the point where you can get:
We have already pre-computed and subsetted the dataset (with 6688 cells and 3585 genes) following the analysis steps in this course. We then saved the objects, so you can use common tools to open and start to work with them (either in R or Python).
# Load the pre-computed objects (same dataset, two ecosystems).
# SEURAT
obj <- readRDS("trajectory_seurat.rds")
# SCRAN
sce <- readRDS("trajectory_scran.rds")
# Let's visualise which clusters we have in our dataset:
# Metadata columns to visualise on the UMAP
vars <- c("batches", "dataset", "clusters", "Phase")
pl <- list()
# SEURAT
for (v in vars) {
    pl[[v]] <- DimPlot(obj, group.by = v, label = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)
# SCRAN
for (v in vars) {
    pl[[v]] <- plotReducedDim(sce, "umap", colour_by = v, text_by = v) +
        theme_void() + NoLegend()
}
plot_grid(plotlist = pl)
# You can check, for example, how many cells are in each cluster:
# SEURAT
# Number of cells per cluster
table(obj$clusters)##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 128 71 20 109 90 160 147 120 160 123 130 132 78 90 150 140 76 141 90 98
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 149 90 10 33 56 154 98 76 125 44 64 150 150 146 150 148 135 128 79 70
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 145 12 134 110 149 140 113 4 132 85 6 126 129 57 129 137 147 127 118 120
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 101 20 8 20 10 12 6 13 20 5 10 10 10 15
# SCRAN
# Number of cells per cluster (same clustering as the Seurat object)
table(sce$clusters)##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 128 71 20 109 90 160 147 120 160 123 130 132 78 90 150 140 76 141 90 98
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 149 90 10 33 56 154 98 76 125 44 64 150 150 146 150 148 135 128 79 70
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 145 12 134 110 149 140 113 4 132 85 6 126 129 57 129 137 147 127 118 120
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74
## 101 20 8 20 10 12 6 13 20 5 10 10 10 15
It is crucial that whoever performs the analysis of a dataset understands what is going on: what are the clusters you see in your data and, most importantly, how are the clusters related to each other? Well, let's explore the data a bit. With the help of this table, write down which cluster numbers in your dataset express these key markers.
| Marker | Cell Type |
|---|---|
| Cd34 | HSC progenitor |
| Ms4a1 | B cell lineage |
| Cd3e | T cell lineage |
| Ltf | Granulocyte lineage |
| Cst3 | Monocyte lineage |
| Mcpt8 | Mast Cell lineage |
| Alas2 | RBC lineage |
| Siglech | Dendritic cell lineage |
| C1qc | Macrophage cell lineage |
| Pf4 | Megakaryocyte cell lineage |
# Marker genes for the main bone-marrow lineages (see table above)
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech",
    "C1qc", "Pf4")
pl <- list()
# SEURAT: cluster overview first, then one feature plot per marker
pl <- list(DimPlot(obj, group.by = "clusters", label = T) + theme_void() + NoLegend())
for (g in vars) {
    pl[[g]] <- FeaturePlot(obj, features = g, order = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)
# SCRAN: same panels using scater
pl <- list(plotReducedDim(sce, "umap", colour_by = "clusters", text_by = "clusters") +
    theme_void() + NoLegend())
for (g in vars) {
    pl[[g]] <- plotReducedDim(sce, "umap", colour_by = g, by_exprs_values = "logcounts") +
        theme_void() + NoLegend()
}
plot_grid(plotlist = pl)
# Another way to better explore your data is to look in higher dimensions, to really get a sense for what is right or wrong. As mentioned in the dimensionality reduction exercises, here we ran UMAP with 3 dimensions (IMPORTANT: the UMAP needs to be computed to result in exactly 3 dimensions).
Since the steps below are identical for both the Seurat and Scran pipelines, we will extract the matrices from both, so it is clear what is being used where and to remove long lines of code used to get those matrices. We will use them all.
# Extract the matrices used downstream.
# NOTE: run EITHER the SEURAT block OR the SCRAN block below — each assignment
# overwrites the previous one, so whichever block runs last wins.
# SEURAT
NORM_COUNTS <- obj@assays$RNA@data
UMAP2 <- obj@reductions$umap@cell.embeddings
UMAP3 <- obj@reductions$umap3d@cell.embeddings
HARMONY <- obj@reductions$harmony_Phase@cell.embeddings
PCA <- obj@reductions$pca@cell.embeddings
PCA_loadings <- obj@reductions$pca@feature.loadings
clustering <- factor(obj$clusters)
KNN <- obj@graphs$knn
# SCRAN
NORM_COUNTS <- logcounts(sce)
UMAP2 <- reducedDim(sce, "umap")
UMAP3 <- reducedDim(sce, "umap3d")
HARMONY <- reducedDim(sce, "harmony_Phase")
PCA <- reducedDim(sce, "pca")
PCA_loadings <- attr(reducedDim(sce, "pca"), "rotation")
clustering <- factor(sce$clusters)
KNN <- reducedDim(sce, "knn")
# Calculate cluster centroids (for plotting the labels later)
# mm is a one-hot cluster-membership matrix; a centroid is the per-cluster
# mean of the embedding coordinates.
mm <- sparse.model.matrix(~0 + factor(clustering))
colnames(mm) <- levels(factor(clustering))
centroids3d <- as.matrix(t(t(UMAP3) %*% mm)/Matrix::colSums(mm))
centroids2d <- as.matrix(t(t(UMAP2) %*% mm)/Matrix::colSums(mm))Plot in 3D with RGL:
# Plot in 3D with RGL
rgl::open3d()
## null
## 1
points3d(x = UMAP3[, 1], y = UMAP3[, 2], z = UMAP3[, 3], col = pal[factor(clustering)])
text3d(t(centroids3d[, 1:3]), texts = rownames(centroids3d), cex = 1)
# Save the interactive widget to an html file and open it in the browser
try(htmlwidgets::saveWidget(rglwidget(width = 1000, height = 800),
    selfcontained = T, "umap_3d_clustering_rgl.html"), silent = T)
browseURL("umap_3d_clustering_rgl.html")
rgl::close3d()
# Or plot in 3D with Plotly:
df <- data.frame(UMAP3, variable = clustering)
colnames(df)[1:3] <- c("UMAP_1", "UMAP_2", "UMAP_3")
p_State <- plot_ly(df, x = ~UMAP_1, y = ~UMAP_2, z = ~UMAP_3,
    color = ~variable, colors = pal, size = 0.5)
try(htmlwidgets::saveWidget(p_State, selfcontained = T,
    "umap_3d_clustering_plotly.html"), silent = T)
browseURL("umap_3d_clustering_plotly.html")
p_State
# Before we take a dive into the trajectory itself, we need to clean up the data a bit, so we can get nice figures later. This dataset is already clean in terms of quality of the cells and so on, but as you probably noticed in the clustering exercise, there are always some outlier cells in the "wrong" part of the plot (as in, far away from cells of the same cluster).
In reality, this is a distortion caused by the graph layout algorithm (UMAP in this case), which tries to force the data to be shown in 2 dimensions. If you check the 3D plots, the data points are in the "correct" position, closer to other points in the same clusters (there is not a single misplaced point). Still, in most cases not even 3 dimensions is enough to represent the data correctly.
# Calculate the distance from every cell to the mean position of its KNN neighbors
expected_U2d <- t(t(UMAP2) %*% KNN)/colSums2(KNN)
d <- sqrt(rowSums((expected_U2d - UMAP2)^2))
# Define a distance cutoff (mean + 5 SD flags extreme layout outliers)
hist(d, breaks = 400)
cutoff <- mean(d) + 5 * sd(d)
abline(v = cutoff, col = "red", xpd = F)
cutoff
## [1] 0.7806166
to_keep <- (d < cutoff)
# Highlight the outliers on the 2D UMAP together with the KNN graph
mypar()
plot(UMAP2, type = "n")
draw_graph(layout = UMAP2, graph = KNN)
points(UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0),
    bg = pal[clustering], pch = 21)
text(centroids2d, labels = rownames(centroids2d), cex = 1, font = 2)
# In most cases this doesn't affect any results, but it affects the trajectories. There are many ways to handle this, but the simplest is just to move the outlier cells closer to their "cluster-mates". We only need to do this for the UMAP 2D layout.
# Replace each outlier's 2D coordinates by the mean position of its KNN neighbors
new_UMAP2 <- UMAP2
expected <- as.matrix(t(t(UMAP2) %*% KNN)/colSums2(KNN))
new_UMAP2[!to_keep, ] <- expected[!to_keep, ]
# Recompute the 2D centroids on the corrected layout
new_centroids2d <- as.matrix(t(t(new_UMAP2) %*% mm)/Matrix::colSums(mm))
# And let's plot the UMAP again:
# Check the umap in 2D
mypar()
plot(new_UMAP2, type = "n")
draw_graph(layout = new_UMAP2, graph = KNN)
points(new_UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0),
    bg = pal[clustering], pch = 21)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)
# Much better!
Up to this point, the steps above have been somewhat covered in the previous lectures. From now on, we will start using those clustering and data reduction techniques for trajectory inference.
Here, the choice of which dimensionality reduction to use has a great impact on your results. As explained above and in the dimensionality reduction exercise, using a 2D UMAP to create trajectories is NOT a good idea; it is literally a distortion of your data! Instead, we should use other multidimensional representations (more than 2D) for the calculations and only later visualise the results on the UMAP 2D.
Here, we can run even some other additional dimensionality reduction methods (ICA and DiffusionMaps) on top of the integrated harmony embedding, where the batch effects were corrected.
Let's first explore ICA:
# Computing ICA on the batch-corrected harmony embedding
ICA_object <- fastICA::fastICA(X = HARMONY, n.comp = 20, method = "C", row.norm = T)
## colstandard
ICA <- ICA_object$S
colnames(ICA) <- paste0("ICA_", 1:ncol(ICA))
# Show the UMAP for reference, then pairs of ICA components with cluster centroids
mypar(3, 3)
plot(new_UMAP2, pch = 16, col = pal[clustering])
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)
for (k in 1:8) {
    dims <- c(k * 2 - 1, k * 2)
    cc <- t(t(ICA[, dims]) %*% mm)/Matrix::colSums(mm)
    plot(ICA[, dims], pch = 16, col = pal[clustering])
    text(cc[, 1], cc[, 2], labels = rownames(cc), cex = 1, font = 2)
}
# Let's now explore Diffusion Maps (DM):
# Computing Diffusion Maps on the batch-corrected harmony embedding
DM_object <- destiny::DiffusionMap(data = HARMONY, k = 20, sigma = "global")
DM <- DM_object@eigenvectors
colnames(DM) <- paste0("DC_", 1:ncol(DM))
# UMAP for reference, then pairs of diffusion components with cluster centroids
mypar(3, 3)
plot(new_UMAP2, pch = 16, col = pal[clustering])
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)
for (k in 1:8) {
    dims <- c(k * 2 - 1, k * 2)
    cc <- t(t(DM[, dims]) %*% mm)/Matrix::colSums(mm)
    plot(DM[, dims], pch = 16, col = pal[clustering])
    text(cc[, 1], cc[, 2], labels = rownames(cc), cex = 1, font = 2)
}
# Optional: diffusion pseudotime
# DM_object@transitions
# res <- destiny::DPT(DM_object)
# destiny::plot.DPT(res)
# res@branch
# Let's run the default Slingshot lineage identification on the 2D UMAP to get a first impression of it (for the sake of simplicity), but you are welcome to explore different embeddings and use more dimensions. The whole process can be done using a single function named slingshot, which is simply a wrapper for the 2 main steps of trajectory inference. The first step of the process is to define the lineages and then fit a curve through the data that defines a trajectory. These steps are broken down below for clarity.
# Run Slingshot on the (fixed) 2D UMAP, with no start/end cluster specified
set.seed(1)
lineages <- getLineages(data = new_UMAP2, clusterLabels = clustering)
# Change the reduction (FOR VISUALISATION ONLY, in case you use another
# dimension for calculations)
lineages@reducedDim <- new_UMAP2
# Plot the lineages next to the KNN graph view
mypar(1, 2)
plot(new_UMAP2, col = pal[clustering], cex = 0.5, pch = 16)
lines(lineages, lwd = 2, col = "black", cex = 3)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 0.8, font = 2, col = "white")
# Check the umap in 2D
plot(new_UMAP2, type = "n")
draw_graph(layout = new_UMAP2, graph = KNN)
points(new_UMAP2, cex = ifelse(!to_keep, 1, 0.3), lwd = ifelse(!to_keep, 2, 0),
    bg = pal[clustering], pch = 21)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 0.8, font = 2)
# Now take a closer look at the object. Which lineages do you see? Which is the starting cell cluster?
print(lineages)## class: SlingshotDataSet
##
## Samples Dimensions
## 6688 2
##
## lineages: 21
## Lineage1: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 49 64
## Lineage2: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 49 50
## Lineage3: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 62
## Lineage4: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 69
## Lineage5: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 42 56 66
## Lineage6: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 59 61 44 60 58 29 42 56 74
## Lineage7: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 26 21 12 20 16 51
## Lineage8: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 38 53
## Lineage9: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 10 55 73
## Lineage10: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 26 21 65
## Lineage11: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 36 37 41 72 33
## Lineage12: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 7 32 6 54 25
## Lineage13: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 7 32 6 54 68
## Lineage14: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 7 32 6 54 71
## Lineage15: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 7 32 6 27
## Lineage16: 70 19 17 22 28 14 13 5 2 1 9 46 15 45 35 11 18 34 57
## Lineage17: 70 19 17 22 28 14 13 5 39 40 4 48 63 24
## Lineage18: 70 19 17 22 28 14 13 5 39 40 31 67
## Lineage19: 70 19 17 22 28 14 13 5 23
## Lineage20: 70 19 17 22 28 14 3
## Lineage21: 70 19 30
##
## curves: 0
Here we see one central issue with trajectory analysis: where does the trajectory begin? Without any extra information, this is nearly an impossible task for any T.I. method. We need prior biological information to be able to define where the trajectory starts and where it should end.
First, we need to make sure to identify which cluster is the progenitor cell. In this case, they express the marker CD34.
| Marker | Cell Type |
|---|---|
| Cd34 | HSC progenitor |
| Ms4a1 | B cell lineage |
| Cd3e | T cell lineage |
| Ltf | Granulocyte lineage |
| Cst3 | Monocyte lineage |
| Mcpt8 | Mast Cell lineage |
| Alas2 | RBC lineage |
| Siglech | Dendritic cell lineage |
# SEURAT: cluster labels next to Cd34 (HSC progenitor marker) expression
plot_grid(plotlist = list(
    DimPlot(obj, group.by = "clusters", label = T) + theme_void() + NoLegend(),
    FeaturePlot(obj, features = "Cd34", order = T) + theme_void() + NoLegend()))
# SCRAN: same two panels
plot_grid(plotlist = list(
    plotReducedDim(sce, "umap", colour_by = "clusters", text_by = "clusters") +
        theme_void() + NoLegend(),
    plotReducedDim(sce, "umap", colour_by = "Cd34", by_exprs_values = "logcounts") +
        theme_void() + NoLegend()))
# Then, we can pass that information on where the trajectory starts to the getLineages function.
# Run Slingshot again, now anchored at the progenitor cluster
set.seed(1)
lineages <- getLineages(data = new_UMAP2,
    clusterLabels = clustering,
    # end.clus = c("4","3","13","9"),  # You can also define the ENDS!
    start.clus = "57")  # Define where to START the trajectories
# Change the reduction (FOR VISUALISATION ONLY, in case you use another dimension for calculations)
lineages@reducedDim <- new_UMAP2
# Plot the lineages
mypar()
plot(new_UMAP2, col = pal[clustering], cex = 0.5, pch = 16)
lines(lineages, lwd = 2, col = "black", cex = 3)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 0.8, font = 2, col = "white")
# If you compare this plot with the previous one, you will not notice many differences, but let's check:
print(lineages)## class: SlingshotDataSet
##
## Samples Dimensions
## 6688 2
##
## lineages: 21
## Lineage1: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 49 64
## Lineage2: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 49 50
## Lineage3: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 62
## Lineage4: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 8 43 52 47 69
## Lineage5: 57 34 18 11 35 45 15 46 9 1 2 5 13 14 28 22 17 19 70
## Lineage6: 57 34 18 11 35 45 15 46 9 1 2 5 13 14 28 22 17 19 30
## Lineage7: 57 34 18 11 35 45 15 46 9 1 2 5 39 40 4 48 63 24
## Lineage8: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 42 56 66
## Lineage9: 57 34 18 36 37 41 72 10 55 59 61 44 60 58 29 42 56 74
## Lineage10: 57 34 18 11 35 45 15 46 9 1 2 5 39 40 31 67
## Lineage11: 57 34 18 11 35 45 15 46 9 1 2 5 13 14 3
## Lineage12: 57 34 18 11 35 45 15 46 9 1 2 5 23
## Lineage13: 57 34 18 36 37 26 21 12 20 16 51
## Lineage14: 57 34 18 36 37 41 72 10 55 38 53
## Lineage15: 57 34 18 11 35 7 32 6 54 25
## Lineage16: 57 34 18 11 35 7 32 6 54 68
## Lineage17: 57 34 18 11 35 7 32 6 54 71
## Lineage18: 57 34 18 36 37 41 72 10 55 73
## Lineage19: 57 34 18 11 35 7 32 6 27
## Lineage20: 57 34 18 36 37 26 21 65
## Lineage21: 57 34 18 36 37 41 72 33
##
## curves: 0
What changed?
As you have probably noticed, there are many other small clusters in the dataset that do not express those markers. Those are other, less abundant cell types, for which the intermediate cell states were NOT captured (because not enough cells were sampled)! Slingshot tries to fit a model / MST (minimum spanning tree) on the whole data, so for this example, we can look at the KNN graph to help us filter out some cells for which we can't estimate trajectories with confidence.
Here, we will also remove some other small clusters just for the sake of simplicity in the steps below. In practice, it is sometimes easier to work with parts of the dataset at a time, in order to explore trajectories in more detail.
# Overlay the KNN graph on the UMAP to spot poorly connected clusters
mypar()
plot(new_UMAP2, col = pal[clustering], cex = 0.5, pch = 16)
draw_graph(layout = new_UMAP2, graph = KNN)
text(new_centroids2d, labels = rownames(new_centroids2d), cex = 1, font = 2)
# Define clusters to filter: inspect cluster sizes first
sort(table(clustering))
## clustering
## 48 70 51 67 63 23 65 71 72 73 42 66 68 74 3 62 64 69 24 30
## 4 5 6 6 8 10 10 10 10 10 12 12 13 15 20 20 20 20 33 44
## 25 54 31 40 2 17 28 13 39 50 5 14 19 22 20 27 61 4 44 47
## 56 57 64 70 71 76 76 78 79 85 90 90 90 90 98 98 101 109 110 113
## 59 8 60 10 29 52 58 1 38 53 55 11 12 49 43 37 56 16 46 18
## 118 120 120 123 125 126 127 128 128 129 129 130 132 132 134 135 137 140 140 141
## 41 34 7 57 36 21 45 15 32 33 35 26 6 9
## 145 146 147 147 148 149 149 150 150 150 150 154 160 160
# Small / disconnected clusters to drop before trajectory inference
clusters_to_remove <- c("71", "65", "73", "66", "56", "74", "42", "69", "62",
    "40", "30", "68", "70", "64", "67", "51", "24", "63", "48", "3", "10", "4",
    "31", "72", "39")
cell_to_keep <- !(clustering %in% clusters_to_remove)
# Subset every matrix, graph and annotation consistently
filt_new_UMAP2 <- new_UMAP2[cell_to_keep, ]
filt_UMAP3 <- UMAP3[cell_to_keep, ]
filt_NORM_COUNTS <- NORM_COUNTS[, cell_to_keep]
filt_PCA <- PCA[cell_to_keep, ]
filt_HARMONY <- HARMONY[cell_to_keep, ]
filt_KNN <- KNN[cell_to_keep, cell_to_keep]
filt_clustering <- factor(clustering[cell_to_keep])
# Keep only the centroids of the remaining clusters
filt_new_centroids2d <- as.matrix(new_centroids2d[!(rownames(new_centroids2d) %in%
    clusters_to_remove), ])
filtcentroids3d <- as.matrix(centroids3d[!(rownames(centroids3d) %in%
    clusters_to_remove), ])
# Plot the filtered dataset
mypar()
plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[filt_clustering], cex = 0.5, pch = 16)
text(filt_new_centroids2d[, 1:2], labels = rownames(filt_new_centroids2d),
    cex = 1, font = 1)
# Another issue is related to the clustering resolution used. For trajectories, it is better to use a fine-grained resolution to separate intermediate cell types. Again, there are many approaches to that; here we will use the KNN graph to compute the amount of connections between clusters. If the amount of connections is above a certain threshold, we then merge those clusters. IMPORTANT: The code below is very useful on many other single cell data analysis occasions!
# Compute connections between clusters on a graph
# One-hot cluster-membership matrix for the filtered clustering
filt_mm <- sparse.model.matrix(~0 + factor(filt_clustering))
colnames(filt_mm) <- levels(factor(filt_clustering))
# Edge counts between clusters, normalised by the geometric mean of the two
# cluster sizes (so big clusters don't dominate)
d <- t(filt_KNN %*% filt_mm) %*% filt_mm/(t(t(colSums(filt_mm))) %*% t(colSums(filt_mm)))^(1/2)
diag(d) <- 0
d <- drop0(d)
pheatmap::pheatmap(d, clustering_method = "ward.D2")# Merging similar clusters clusters
# Cutoff = 1.2 x the RMS of the non-zero connectivity values
hist(d@x, breaks = 50)
cutoff <- 1.2 * (sum((d@x^2))/(length(d@x) - 1))^(1/2)
abline(v = (cutoff), col = "red", xpd = F)cutoff## [1] 2.536375
# Binary merge matrix: 1 where two clusters are connected above the cutoff;
# only the lower triangle is kept so each pair is counted once
to_merge <- drop0((d > cutoff) * 1)
to_merge <- to_merge * lower.tri(to_merge)
# Clusters that merge with nothing map to themselves
diag(to_merge)[rowSums(to_merge) == 0] <- 1
# Plot cluster mappings
pheatmap::pheatmap(to_merge, cluster_rows = F, cluster_cols = F)# Merge the cluster labels
# Build (from, to) label pairs from the sparse-matrix i/p slots, then relabel
# every cell via match()
mappings <- cbind(from = colnames(to_merge)[to_merge@i + 1], to = colnames(to_merge)[rep(x = 1:(length(to_merge@p) -
1), times = (to_merge@p[-1] - to_merge@p[-length(to_merge@p)]))])
merged_filt_clustering <- factor(mappings[match(filt_clustering, mappings[, 1]),
2])
# Recompute membership matrix and centroids for the merged clustering
merged_filt_mm <- sparse.model.matrix(~0 + factor(merged_filt_clustering))
colnames(merged_filt_mm) <- levels(factor(merged_filt_clustering))
merged_filt_new_centroids2d <- as.matrix(t(t(filt_new_UMAP2) %*% merged_filt_mm)/Matrix::colSums(merged_filt_mm))
# Plot the new clusterings
mypar()
plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16)
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
cex = 1, font = 1)We can now compute the lineages on these filtered and re-grouped data. Note that the cluster containing our “Cd34” marker also have changed.
# Define lineage ends (known terminal clusters)
ENDS <- c("17", "27", "25", "16", "26", "53", "49")
set.seed(1)
lineages <- getLineages(data = filt_new_UMAP2,
    clusterLabels = merged_filt_clustering,
    end.clus = ENDS,      # You can also define the ENDS!
    start.clus = "34")    # Define where to START the trajectories
# IF NEEDED, ONE CAN ALSO MANUALLY EDIT THE LINEAGES, FOR EXAMPLE:
# sel <- sapply( lineages@lineages, function(x){rev(x)[1]} ) %in% ENDS
# lineages@lineages <- lineages@lineages[ sel ]
# names(lineages@lineages) <- paste0("Lineage",1:length(lineages@lineages))
# lineages
# Change the reduction to our "fixed" UMAP2d (FOR VISUALISATION ONLY)
lineages@reducedDim <- filt_new_UMAP2
# Plot the lineages on the filtered UMAP
mypar()
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16)
lines(lineages, lwd = 1, col = "black", cex = 2)
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 0.8, font = 2, col = "white")
# Much better!
Once the clusters are connected, Slingshot allows you to transform them to a smooth trajectory using principal curves. This is an algorithm that iteratively changes an initial curve to better match the data points. It was developed for linear data. To apply it to single-cell data, slingshot adds two enhancements:
Since the function getCurves() takes some time to run, we can speed up the convergence of the curve fitting process by reducing the amount of cells used in each lineage. Ideally you would use all cells, but here we set approx_points to 100 to speed things up. Feel free to adjust that for your dataset.
# Define curves
# approx_points trades curve-fitting accuracy for speed; increase it (or drop
# the argument) for final analyses
curves <- getCurves(lineages, thresh = 0.1, stretch = 0.1, allow.breaks = F, approx_points = 100)
curves## class: SlingshotDataSet
##
## Samples Dimensions
## 5828 2
##
## lineages: 7
## Lineage1: 34 18 36 33 55 59 44 60 58 29 8 43 47 49
## Lineage2: 34 18 11 35 15 46 9 1 2 5 13 28 17
## Lineage3: 34 18 11 35 7 32 6 54 25
## Lineage4: 34 18 11 35 7 32 6 27
## Lineage5: 34 18 36 21 12 20 16
## Lineage6: 34 18 36 33 55 38 53
## Lineage7: 34 18 36 26
##
## curves: 7
## Curve1: Length: 6.7209 Samples: 2148.23
## Curve2: Length: 7.3478 Samples: 2108.92
## Curve3: Length: 3.463 Samples: 1285.44
## Curve4: Length: 2.546 Samples: 1203.13
## Curve5: Length: 2.9266 Samples: 971.56
## Curve6: Length: 2.8915 Samples: 1071.83
## Curve7: Length: 2.1368 Samples: 637.54
# Plot the fitted principal curves over the clusters
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
lines(curves, lwd = 2, col = "black")
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 1, font = 2)
# With those results in hand, we can now compute the differentiation pseudotime.
pseudotime <- slingPseudotime(curves, na = FALSE)
cellWeights <- slingCurveWeights(curves)
# Average the pseudotime across curves and scale to [0, 1] for the color ramp
x <- rowMeans(pseudotime)
x <- x/max(x)
o <- order(x)
mypar()
plot(filt_new_UMAP2[o, ], main = paste0("pseudotime"), pch = 16, cex = 0.4,
    axes = F, xlab = "", ylab = "",
    col = colorRampPalette(c("grey70", "orange3", "firebrick", "purple4"))(99)[x[o] * 98 + 1])
points(merged_filt_new_centroids2d, cex = 2.5, pch = 16, col = "#FFFFFF99")
text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
    cex = 1, font = 2)
# IMPORTANT: The pseudotime represents the distance of every cell to the starting cluster!
Before computing differential gene expression, it is a good idea to make sure our dataset is somewhat homogeneous (without very strong batch effects). In this dataset, we actually used data from 3 different technologies (Drop-seq, SmartSeq2 and 10X) and therefore massive differences in read counts can be observed:
# Compare sequencing depth (nUMI) across batches
# SEURAT
VlnPlot(obj, features = "nUMI", group.by = "batches")
# SCRAN
plotColData(sce, y = "nUMI", x = "batches", colour_by = "batches")
# Since we are only interested in the differentiation paths for each cell type, and not in the batch effects, we can use the integrated space of harmony embeddings (where batch effects were removed). Since harmony (the same applies to MNN, scanorama and LIGER) is a corrected version of PCA, we can multiply the harmony embeddings with the PCA loadings to generate batch-corrected "pseudocounts". Note that we can only reconstruct data from the highly variable genes that were used to compute PCA and HARMONY.
# Get the gene means and standar deviation
# NOTE: sparseMatrixStats is already attached at the top of the script; this
# call is redundant but harmless
library(sparseMatrixStats)
genes <- rownames(PCA_loadings)
gene_means <- rowMeans2(filt_NORM_COUNTS[genes, ])
gene_sd <- sqrt(rowVars(filt_NORM_COUNTS[genes, ]))
# Project normalized gene counts
# Reconstruct per-gene values from the corrected embedding; the -0.02 offset
# pushes small reconstruction noise below zero so it is truncated next
CORRECTED_NORMCOUNTS <- t(filt_HARMONY %*% t(PCA_loadings)) * gene_sd + gene_means -
0.02
CORRECTED_NORMCOUNTS <- Matrix(round(CORRECTED_NORMCOUNTS, 3), sparse = T)
# Truncate negative reconstructed values to zero and drop explicit zeros
CORRECTED_NORMCOUNTS@x[CORRECTED_NORMCOUNTS@x < 0] <- 0
CORRECTED_NORMCOUNTS <- drop0(CORRECTED_NORMCOUNTS)
# Transform the normalized data back to raw counts (used for differential
# expression)
CORRECTED_COUNTS <- round((expm1(CORRECTED_NORMCOUNTS)) * 1000)Let’s compare how the normalized data compares to the batch-correceted one.
# Compare original normalized values vs batch-corrected reconstruction per marker
mypar(3, 3)
plot(filt_new_UMAP2, type = "n")
draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
points(filt_new_UMAP2, col = pal[filt_clustering], pch = 16)
text(merged_filt_new_centroids2d[, 1], merged_filt_new_centroids2d[, 2],
    labels = rownames(merged_filt_new_centroids2d), cex = 0.8, font = 2)
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech")
for (g in vars) {
    plot(filt_NORM_COUNTS[g, ], CORRECTED_NORMCOUNTS[g, ], main = g, pch = 16, cex = 0.4)
    # Red box marks cells with zero observed counts but non-zero reconstructed
    # ("imputed") values
    rr <- c(diff(range(filt_NORM_COUNTS[g, ]))/50, (range(CORRECTED_NORMCOUNTS[g, ])))
    polygon(c(-rr[1], -rr[1], rr[1], rr[1]), c(rr[3], rr[2], rr[2], rr[3]), border = "red")
    text(rr[1], max(CORRECTED_NORMCOUNTS[g, ]), " < Imputed\n counts", adj = c(0, 1),
        col = "red", font = 2)
}
# IMPORTANT: Please note in the graphs above that there is a significant amount of imputation (i.e., we artificially add counts to certain cells where we'd expect none). Please keep this in mind and use these matrices with caution in downstream analysis!
# Let's also take a closer inspection on the UMAPs:
mypar(4, 5, mar = c(0.1, 0.1, 2, 1))
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech", "C1qc")
# For each matrix (original vs corrected), plot the clusters and then each marker
for (j in c("filt_NORM_COUNTS", "CORRECTED_NORMCOUNTS")) {
    plot(filt_new_UMAP2, type = "n", axes = F, xlab = "", ylab = "", main = j)
    draw_graph(layout = filt_new_UMAP2, graph = filt_KNN)
    points(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
    text(merged_filt_new_centroids2d, labels = rownames(merged_filt_new_centroids2d),
        cex = 0.8, font = 2)
    for (i in vars) {
        x <- get(j)[i, ]
        # BUG FIX: min-max scale to [0, 1]. The original line was
        # `x <- x - min(x)/(max(x) - min(x))`, which (by operator precedence)
        # only shifts the values and can index the color ramp out of range.
        # The correct form is used later in this script for the top-gene plots.
        x <- (x - min(x))/(max(x) - min(x))
        o <- order(x)
        plot(filt_new_UMAP2[o, ], main = paste0(i), pch = 16, cex = 0.4, axes = F,
            xlab = "", ylab = "", col = colorRampPalette(c("lightgray", "blue"))(99)[x[o] *
                98 + 1])
    }
}
tradeSeq is a recently proposed algorithm to find trajectory differentially expressed genes. It works by smoothing the gene expression along the trajectory by fitting a smoother using generalized additive models (GAMs), and testing whether certain coefficients are statistically different between points in the trajectory.
# Register a parallel backend for tradeSeq's GAM fitting
BiocParallel::register(BiocParallel::MulticoreParam())
# The fitting of GAMs can take quite a while, so for demonstration purposes we first do a very stringent filtering of the genes.
IMPORTANT: In an ideal experiment, you would use all the genes, or at least those defined as being variable.
# Subsample 20 cells per merged cluster to keep the GAM fitting fast
sel_cells <- split(colnames(CORRECTED_COUNTS), merged_filt_clustering)
sel_cells <- unlist(lapply(sel_cells, function(cells) {
    set.seed(1)
    sample(cells, 20)
}))
# Keep the 500 genes with the highest biological variance component
gv <- as.data.frame(na.omit(modelGeneVar(CORRECTED_NORMCOUNTS[, sel_cells])))
gv <- gv[order(gv$bio, decreasing = T), ]
sel_genes <- sort(rownames(gv)[1:500])
# Fit one negative-binomial GAM per gene along each lineage's pseudotime
# (the slow step of tradeSeq; parallelised via BiocParallel)
sceGAM <- tradeSeq::fitGAM(counts = drop0(CORRECTED_COUNTS[sel_genes, sel_cells]),
pseudotime = pseudotime[sel_cells, ], cellWeights = cellWeights[sel_cells, ],
nknots = 10, verbose = T, parallel = T, BPPARAM = BiocParallel::MulticoreParam())##
|
| | 0%
|
|== | 2%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 20%
|
|================ | 22%
|
|================== | 25%
|
|=================== | 28%
|
|===================== | 30%
|
|======================= | 32%
|
|======================== | 35%
|
|========================== | 38%
|
|============================ | 40%
|
|============================== | 42%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 58%
|
|========================================== | 60%
|
|============================================ | 62%
|
|============================================== | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 72%
|
|==================================================== | 75%
|
|====================================================== | 78%
|
|======================================================== | 80%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
# Visualize the fitted GAM knots on top of the trajectory curves
plotGeneCount(curves, clusters = merged_filt_clustering, models = sceGAM)
# Print the inferred lineage structure (cluster paths from root to tips)
lineages
## class: SlingshotDataSet
##
##  Samples Dimensions
##     5828          2
##
## lineages: 7
## Lineage1: 34  18  36  33  55  59  44  60  58  29  8  43  47  49
## Lineage2: 34  18  11  35  15  46  9  1  2  5  13  28  17
## Lineage3: 34  18  11  35  7  32  6  54  25
## Lineage4: 34  18  11  35  7  32  6  27
## Lineage5: 34  18  36  21  12  20  16
## Lineage6: 34  18  36  33  55  38  53
##
## curves: 0
# Terminal cluster of each lineage = last cluster label in its path.
# vapply (not sapply) guarantees a character vector is returned.
lc <- vapply(lineages@lineages, function(lin) {
    rev(lin)[1]
}, FUN.VALUE = character(1))
# Shorten labels for plotting: "Lineage1" -> "L1"
names(lc) <- gsub("Lineage", "L", names(lc))
mypar()
# UMAP colored by cluster, with trajectory curves and lineage end points
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], pch = 16)
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 16, cex = 4)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white")
# Genes whose expression changes significantly along pseudotime
res <- na.omit(associationTest(sceGAM))
# Keep significant hits with above-average effect size, strongest first
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = TRUE), ]
# head() is safe even if fewer than 10 genes remain (res[1:10, ] would pad NA rows)
head(res, 10)
# 4x4 grid: first panel shows clusters + lineage end points, then one panel
# per top associated gene colored by min-max-scaled expression.
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = FALSE,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = TRUE)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = TRUE)
# Hoist the loop-invariant color palette out of the loop
expr_pal <- colorRampPalette(c("lightgray", "grey60", "navy"))(99)
vars <- rownames(res)[1:15]
for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))  # rescale expression to [0, 1]
    o <- order(x)  # draw high-expression cells last so they stay visible
    plot(filt_new_UMAP2[o, ], main = i, pch = 16, cex = 0.5, axes = FALSE, xlab = "",
        ylab = "", col = expr_pal[x[o] * 98 + 1])
}
# Genes differing between the start (pseudotime 0) and end (1) of each lineage
res <- na.omit(startVsEndTest(sceGAM, pseudotimeValues = c(0, 1)))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = TRUE), ]
# Show only the first 6 columns; head() is safe with < 10 remaining genes
head(res, 10)[, 1:6]
# Get the top UP and DOWN regulated genes in lineage 1 (sorted by logFC)
res_lin1 <- sort(setNames(res$logFClineage1, rownames(res)))
# 7 most up-regulated + 8 most down-regulated
vars <- names(c(rev(res_lin1)[1:7], res_lin1[1:8]))
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = FALSE,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = TRUE)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = TRUE)
# Loop-invariant palette computed once
expr_pal <- colorRampPalette(c("lightgray", "grey60", "navy"))(99)
for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))  # rescale expression to [0, 1]
    o <- order(x)  # draw high-expression cells last
    plot(filt_new_UMAP2[o, ], main = i, pch = 16, cex = 0.5, axes = FALSE, xlab = "",
        ylab = "", col = expr_pal[x[o] * 98 + 1])
}
# tradeSeq offers several between-branch tests: diffEndTest(),
# earlyDETest() and patternTest(). Here we use diffEndTest, which
# compares expression at the end points of the lineages.
res <- na.omit(diffEndTest(sceGAM))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = TRUE), ]
# Show only the first 6 columns; head() is safe with < 10 remaining genes
head(res, 10)[, 1:6]
# Get the top UP and DOWN regulated genes in lineage 1 vs lineage 2
res_lin1_2 <- sort(setNames(res$logFC1_2, rownames(res)))
# 7 most up-regulated + 8 most down-regulated
vars <- names(c(rev(res_lin1_2)[1:7], res_lin1_2[1:8]))
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(filt_new_UMAP2, col = pal[merged_filt_clustering], cex = 0.5, pch = 16, axes = FALSE,
    xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(merged_filt_new_centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = TRUE)
text(merged_filt_new_centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white",
    xpd = TRUE)
# Loop-invariant palette computed once
expr_pal <- colorRampPalette(c("lightgray", "grey60", "navy"))(99)
for (i in vars) {
    x <- CORRECTED_NORMCOUNTS[i, ]
    x <- (x - min(x))/(max(x) - min(x))  # rescale expression to [0, 1]
    o <- order(x)  # draw high-expression cells last
    plot(filt_new_UMAP2[o, ], main = i, pch = 16, cex = 0.5, axes = FALSE, xlab = "",
        ylab = "", col = expr_pal[x[o] * 98 + 1])
}
Cannoodt, Robrecht, Wouter Saelens, and Yvan Saeys. 2016. “Computational Methods for Trajectory Inference from Single-Cell Transcriptomics.” European Journal of Immunology 46 (11): 2496–2506. doi.
Saelens, Wouter, Robrecht Cannoodt, Helena Todorov, and Yvan Saeys. 2019. “A Comparison of Single-Cell Trajectory Inference Methods.” Nature Biotechnology 37 (5): 547–54. doi.
You can recreate your object in Seurat like so:
# # Create object with counts and metadata obj <- CreateSeuratObject( counts =
# Read10X_h5('./data/subset_BM_counts.h5') , meta.data =
# read.csv2('./data/subset_BM_metadata.csv',row.names = 1)) # Add normalized
# counts and variable features obj@assays$RNA@data <-
# Read10X_h5('./data/subset_BM_normdata.h5') obj@assays$RNA@var.features <-
# read.csv2('./data/subset_BM_var_features.csv',row.names = 1) # Add reductions
# and PCA loadings for(i in
# c('pca','harmony','harmony_Phase','umap','umap3d')){ obj@reductions[[i]] <-
# CreateDimReducObject( embeddings =
# as.matrix(Read10X_h5(paste0('./data/subset_BM_',i,'.h5'))) ,key =
# paste0(i,'_'))} obj@reductions$pca@feature.loadings <-
# as.matrix(Read10X_h5(paste0('./data/subset_BM_pca_feature_loadings.h5'))) #
# Add KNN graph obj@graphs[['knn']] <-
# Read10X_h5(paste0('./data/subset_BM_knn.h5')) obj
# saveRDS(obj,'trajectory_seurat.rds')You can recreate your object in Scran like so:
# sce <- SingleCellExperiment( assays=list(
# counts=Read10X_h5('./data/subset_BM_counts.h5'),
# logcounts=Read10X_h5('./data/subset_BM_normdata.h5')),
# reducedDims=SimpleList(
# pca=as.matrix(Read10X_h5(paste0('./data/subset_BM_pca.h5'))),
# harmony=as.matrix(Read10X_h5(paste0('./data/subset_BM_harmony.h5'))),
# harmony_Phase=as.matrix(Read10X_h5(paste0('./data/subset_BM_harmony_Phase.h5'))),
# umap=as.matrix(Read10X_h5(paste0('./data/subset_BM_umap.h5'))),
# umap3d=as.matrix(Read10X_h5(paste0('./data/subset_BM_umap3d.h5'))),
# knn=Read10X_h5(paste0('./data/subset_BM_knn.h5'))) ) sce@colData <-
# DataFrame(read.csv2('./data/subset_BM_metadata.csv',row.names = 1)) attr(x =
# reducedDim(sce,'pca'),which = 'rotation') <-
# as.matrix(Read10X_h5(paste0('./data/subset_BM_pca_feature_loadings.h5')))
# saveRDS(sce,'trajectory_scran.rds')